Creating a logistic regression model with Theano

Based on Theano's tutorial.



In [2]:

    
# Importing libraries

import pandas as pd

import numpy as np

import theano
import theano.tensor as T

import matplotlib.pyplot as plt
%pylab inline 

import cPickle









    



Populating the interactive namespace from numpy and matplotlib



In [3]:

    
# Loading data

df = pd.read_csv('../data/train.csv')
df = df.astype(np.float64)



In [4]:

    
# Exploring data

print df.shape
df.iloc[0:10,0:10]









    



(42000, 785)






    Out[4]:






  
    
      
      label
      pixel0
      pixel1
      pixel2
      pixel3
      pixel4
      pixel5
      pixel6
      pixel7
      pixel8
    
  
  
    
      0
       1
       0
       0
       0
       0
       0
       0
       0
       0
       0
    
    
      1
       0
       0
       0
       0
       0
       0
       0
       0
       0
       0
    
    
      2
       1
       0
       0
       0
       0
       0
       0
       0
       0
       0
    
    
      3
       4
       0
       0
       0
       0
       0
       0
       0
       0
       0
    
    
      4
       0
       0
       0
       0
       0
       0
       0
       0
       0
       0
    
    
      5
       0
       0
       0
       0
       0
       0
       0
       0
       0
       0
    
    
      6
       7
       0
       0
       0
       0
       0
       0
       0
       0
       0
    
    
      7
       3
       0
       0
       0
       0
       0
       0
       0
       0
       0
    
    
      8
       5
       0
       0
       0
       0
       0
       0
       0
       0
       0
    
    
      9
       3
       0
       0
       0
       0
       0
       0
       0
       0
       0



In [5]:

    
# Showing some data

f, (ax1, ax2) = plt.subplots(ncols=2)
image_size = (28,28)
ax1.matshow( np.reshape(df.iloc[0,1:],image_size), cmap='gray_r')
ax2.matshow( np.reshape(df.iloc[7,1:],image_size), cmap='gray_r')
f.show()









    



/usr/lib/python2.7/dist-packages/matplotlib/figure.py:387: UserWarning: matplotlib is currently using a non-GUI backend, so cannot show the figure
  "matplotlib is currently using a non-GUI backend, "



In [6]:

    
# Organizing data

N = df.shape[0]
features = df.shape[1]-1
outputs = 10

def hot_vector_from_category_num(x):
    v = [0]*10
    v[int(x)] = 1
    return v

D = (
    df.iloc[:,1:],
    np.array(map(hot_vector_from_category_num, df.iloc[:,0]))
    )



In [7]:

    
# Making a logistic regression model with Theano

class LogisticRegressionModel(object):
    def __init__(self, inputs, outputs, learning_rate = 0.1):
        self.x = T.matrix('x') #Inputs
        self.y = T.matrix('y') #Output
        
        # Weights and bias     
        self.w = theano.shared(np.random.randn(inputs, outputs), name='w')
        self.b = theano.shared(np.zeros(outputs), name='b')

        
        # Theano expression graph

        ## Probability that target=1
        self.p_1 = ( 1.0 / (1.0 + T.exp(-T.dot(self.x,self.w)-self.b)) )

        ## The prediction thresholded
        #self.prediction = self.p_1 >= 0.5
        self.prediction = self.p_1 >= T.max(self.p_1)
        #self.prediction = self.p_1

        ## Cross-entropy loss function
        self.cross_entropy = -self.y*T.log(self.p_1) - (1-self.y)*T.log(1-self.p_1)

        ## The cost to minimize
        self.cost = self.cross_entropy.mean() + 0.01 * (self.w**2).sum()

        ## Computing the gradient of the cost
        self.gw, self.gb = T.grad(self.cost, [self.w, self.b])


        # Compiling the graph
        self.train = theano.function(
            inputs = [self.x, self.y],
            outputs = [self.prediction, self.cross_entropy],
            updates = ((self.w, self.w-learning_rate*self.gw), (self.b, self.b-learning_rate*self.gb))
        )
        
        self.predict = theano.function(
            inputs = [self.x],
            outputs = self.prediction
        )



In [8]:

    
# Creating a model instance and training it

print 'Creating model...'
my_model = LogisticRegressionModel(features, outputs)

print 'Training...'
training_steps = 1
for i in xrange(training_steps):
    pred, err = my_model.train(D[0], D[1])









    



Creating model...
Training...



In [9]:

    
# Checking percentage of error

pred_max = pred >= np.max(pred)
print np.sum(pred_max - D[1])/float(N)









    



4.24004761905



In [10]:

    
# Saving model

model_file = file('log_reg.model', 'wb')
cPickle.dump(my_model, model_file, protocol=cPickle.HIGHEST_PROTOCOL)
model_file.close()



In [11]:

    
# Loading a copy of the model

model_file = file('log_reg.model', 'rb')
another_model = cPickle.load(model_file)
model_file.close()

# Trying it
print another_model.predict(D[0])









    



[[0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 ..., 
 [0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]]

	label	pixel0	pixel1	pixel2	pixel3	pixel4	pixel5	pixel6	pixel7	pixel8
0	1	0	0	0	0	0	0	0	0	0
1	0	0	0	0	0	0	0	0	0	0
2	1	0	0	0	0	0	0	0	0	0
3	4	0	0	0	0	0	0	0	0	0
4	0	0	0	0	0	0	0	0	0	0
5	0	0	0	0	0	0	0	0	0	0
6	7	0	0	0	0	0	0	0	0	0
7	3	0	0	0	0	0	0	0	0	0
8	5	0	0	0	0	0	0	0	0	0
9	3	0	0	0	0	0	0	0	0	0

	label	pixel0	pixel1	pixel2	pixel3	pixel4	pixel5	pixel6	pixel7	pixel8
0	1	0	0	0	0	0	0	0	0	0
1	0	0	0	0	0	0	0	0	0	0
2	1	0	0	0	0	0	0	0	0	0
3	4	0	0	0	0	0	0	0	0	0
4	0	0	0	0	0	0	0	0	0	0
5	0	0	0	0	0	0	0	0	0	0
6	7	0	0	0	0	0	0	0	0	0
7	3	0	0	0	0	0	0	0	0	0
8	5	0	0	0	0	0	0	0	0	0
9	3	0	0	0	0	0	0	0	0	0

	label	pixel0	pixel1	pixel2	pixel3	pixel4	pixel5	pixel6	pixel7	pixel8
0	1	0	0	0	0	0	0	0	0	0
1	0	0	0	0	0	0	0	0	0	0
2	1	0	0	0	0	0	0	0	0	0
3	4	0	0	0	0	0	0	0	0	0
4	0	0	0	0	0	0	0	0	0	0
5	0	0	0	0	0	0	0	0	0	0
6	7	0	0	0	0	0	0	0	0	0
7	3	0	0	0	0	0	0	0	0	0
8	5	0	0	0	0	0	0	0	0	0
9	3	0	0	0	0	0	0	0	0	0